Author: Amanda Everitt
Began: 8/24/2018
Finished:: 8/25/2018

[Goal]

Investigate the clustering of neuronal cells in particular and Tbr1+ neuronal cells without the background noise of the other cellular subtypes. First, lets look at clustering of neuronal cells and see if it as homogenous as the first tSNE makes it appear. Next lets see if this pattern is retained in cells noticeably expressing Tbr1 transcripts.

[Neuronal Cells]

Step 1: Load Data for “Neuronal cells” (Cluster 0,1,2)

  • only neuronal cells (Cluster 0,1,2) and using the raw data.
load(file=paste0(wd, "/outputs/output_02/experiment_aggregate.0.3.RData"))
neuronal_cells <- rownames(experiment.aggregate.0.3@meta.data[experiment.aggregate.0.3@meta.data$res.0.3 %in% c(0,1,2),])
experiment.aggregate <- SubsetData(experiment.aggregate.0.3, cells.use = neuronal_cells, do.clean = T)
experiment.aggregate
## An object of class seurat in project Siavash scRNA 
##  14933 genes across 12269 samples.

Step 2: Remove cells with extreme values of genes/umi for this subset

p1a <- ggplot(experiment.aggregate@meta.data, aes(x=nUMI)) + 
  geom_histogram(binwidth=50, color="black", fill="white") + ggtitle("Distribution of nUMI") +
  geom_vline(aes(xintercept=4000),color="red") + theme_bw()
p1b <- VlnPlot(experiment.aggregate, "nUMI", do.return = TRUE, group.by = "orig.ident") + 
  geom_hline(yintercept = 4000, color= "red")
p1b <- p1b + ggtitle("Distribution of nUMI by genotype") + xlab("Genotype") + ylab("count") + theme_bw() + 
  theme(plot.title = element_text(color="black", size=12),
        axis.title.x = element_text(color="black", size=12),
        axis.title.y = element_text(color="black", size=12),
        legend.position = "none")

p2a <- ggplot(experiment.aggregate@meta.data, aes(x=nGene)) + 
  geom_histogram(binwidth=50, color="black", fill="white") + ggtitle("Distribution of nGene") +
  geom_vline(aes(xintercept=500),color="red") + geom_vline(aes(xintercept=1700),color="red") + theme_bw()
p2b <- VlnPlot(experiment.aggregate, "nGene", do.return = TRUE, group.by = "orig.ident") +
  geom_hline(yintercept = 1700, color= "red")
p2b <- p2b + ggtitle("Distribution of nGene by genotype") + xlab("Genotype") + ylab("count") + theme_bw() + 
  theme(plot.title = element_text(color="black", size=12),
        axis.title.x = element_text(color="black", size=12),
        axis.title.y = element_text(color="black", size=12),
        legend.position = "none")

p3a <- ggplot(experiment.aggregate@meta.data, aes(x=percent.mito)) + 
  geom_histogram(binwidth=0.01, color="black", fill="white") + ggtitle("Distribution of mito percentage") +
  geom_vline(aes(xintercept=0.3),color="red") + theme_bw()
p3b <- VlnPlot(experiment.aggregate, "percent.mito", do.return = TRUE, group.by = "orig.ident") +
  geom_hline(yintercept = 0.299, color= "red")
p3b <- p3b + ggtitle("Distribution of mito % by genotype") + xlab("Genotype") + ylab("count") + theme_bw() + 
  theme(plot.title = element_text(color="black", size=12),
        axis.title.x = element_text(color="black", size=12),
        axis.title.y = element_text(color="black", size=12),
        legend.position = "none")

grid.arrange(p1a,p2a, p3a, p1b, p2b, p3b, ncol=3)

experiment.aggregate <- FilterCells(object = experiment.aggregate, subset.names = "nGene", 
                                    low.thresholds = 500 , high.thresholds = 1700)
experiment.aggregate <- FilterCells(object = experiment.aggregate, subset.names = "nUMI", 
                                    low.thresholds = -Inf , high.thresholds = 4000)
experiment.aggregate
## An object of class seurat in project Siavash scRNA 
##  14933 genes across 11943 samples.

Step 3: Remove genes that are non-informative/not well represented/not variable

  • Removing mitochondrial, ribosomal, or psuedogenes
  • Removing genes that don’t occur in 1% of cells (~12)
  • Removing genes with variation below median variation across all genes
counts = as.matrix(experiment.aggregate@raw.data[, colnames(experiment.aggregate@data)])  #Extract Raw Data

means <- rowMeans(as.matrix(experiment.aggregate@data))
vars <- rowVars(as.matrix(experiment.aggregate@data))
rData = data.frame(seuratVarGenes = rownames(counts) %in% experiment.aggregate.0.3@var.genes,
                   NumberCellsOccurIn = as.numeric(rowSums(as.matrix(experiment.aggregate@data) > 0)),
                   MeanExpr = as.numeric(means), 
                   Var = as.numeric(vars),
                   CoefofVar = vars/means^2
                   )
rownames(rData) = rownames(counts)
#Remove mitochondrial, ribosomal, or psuedogenes
pt1 <- grep("^mt-|^Rps|^Rpl|^Mrps|^Mrpl|^Gm", rownames(rData), value = T)

#Remove genes that don't occur in 1% of cells (~12)
pt2 <- rownames(rData[rData$NumberCellsOccurIn < 12,])

#Remove genes with variation below minimum
pt3 <- rownames(rData[rData$Var < median(rData$Var),])
to.remove <- c(pt1, pt2, pt3)

#Update the objects
rData <- rData[!rownames(rData) %in% to.remove, ]
counts <- counts[rownames(rData),]

#Optional plotting
#hist(rData$NumberCellsOccurIn, breaks = 50)
#hist(rData$Var, breaks = 50)

Step 4: Load data into Seurat for visualization

load(paste0(wd, "/", out_dir, "/neuronal_object.RData"))
experiment.aggregate <- CreateSeuratObject(counts, project = "neuronal cells")
experiment.aggregate@meta.data$orig.ident = gsub(".*L5|/", "", rownames(experiment.aggregate@meta.data))
experiment.aggregate <- NormalizeData(object= experiment.aggregate, normalization.method = "LogNormalize", scale.factor = 10000)
experiment.aggregate <- ScaleData(
  object = experiment.aggregate, #will populate new slot "scale data", does not change "data" slot
  do.scale = TRUE,
  do.center = TRUE, 
  vars.to.regress = c("nUMI", "nGene")) 

experiment.aggregate <- RunPCA(
  object = experiment.aggregate,
  pc.genes = rownames(experiment.aggregate@data), #use all genes
  pcs.compute = 20, do.print = F,
  maxit = 500)

experiment.aggregate <- JackStraw(
    object = experiment.aggregate, 
    num.replicate = 100, 
    num.pc = 20)

use.pcs = c(1:8)  #Chose all PCs accounting for more than 2% of variance
experiment.aggregate <- FindClusters(
    object = experiment.aggregate, 
    reduction.type = "pca", 
    dims.use = use.pcs, 
    resolution = seq(0.3,1.2,0.3), 
    print.output = FALSE, 
    force.recalc = TRUE,
    save.SNN = TRUE
)

experiment.aggregate <- SetAllIdent(experiment.aggregate, id = "res.0.3")
experiment.aggregate <- RunTSNE( object = experiment.aggregate, reduction.use = "pca", dims.use = use.pcs, do.fast = TRUE)
experiment.aggregate <- BuildClusterTree(experiment.aggregate, pcs.use = use.pcs, do.reorder = F, reorder.numeric = F, do.plot=F)
save(experiment.aggregate, file=paste0(wd, "/", out_dir, "neuronal_object.RData"))

## $Mgst3

## 
## $Foxp1

## 
## $Bcl11b

Step 4: Use Seurat to get a quick idea of what genes could be driving the seperation of cluster 3,4, and 5

cluster3 <- FindMarkers(object = experiment.aggregate, ident.1 = 3, ident.2 = c(0:2), min.pct = 0,  test.use = "wilcox")
markers_all <- FindAllMarkers(object = experiment.aggregate, min.pct = 0, thresh.use = 0, only.pos = TRUE, test.use = "wilcox")

save(list = c("cluster3", "markers_all"), file=paste0(wd, "/", out_dir, "/neuronal_markers.RData"))
load(paste0(wd, "/", out_dir, "/neuronal_markers.RData"))
top10_FC <- as.data.frame(markers_all[abs(markers_all$p_val_adj) < 0.05, ] %>% group_by(cluster) %>% top_n(10, avg_logFC))
top10_cluster3 <- as.data.frame(markers_all[abs(markers_all$p_val_adj) < 0.05, ] %>% group_by(cluster) %>% top_n(10, avg_logFC))

Cluster 4

top10_FC[top10_FC$cluster == 4,]
##            p_val avg_logFC pct.1 pct.2     p_val_adj cluster    gene
## 41  0.000000e+00 1.9430618 0.790 0.159  0.000000e+00       4  Arpp21
## 42  0.000000e+00 1.5189914 0.420 0.053  0.000000e+00       4    Rbp1
## 43  0.000000e+00 1.4957318 0.382 0.029  0.000000e+00       4 Ppp1r1b
## 44  0.000000e+00 1.4244200 0.319 0.014  0.000000e+00       4  Pcp4l1
## 45 1.922197e-300 1.1634648 0.227 0.011 1.378984e-296       4    Rxrg
## 46 2.178525e-261 1.2631225 0.961 0.672 1.562874e-257       4    Pcp4
## 47 2.675738e-256 1.4546196 0.614 0.156 1.919575e-252       4    Gng7
## 48 2.470751e-165 0.8255434 0.133 0.007 1.772517e-161       4  Crabp1
## 49 8.546886e-101 0.9474987 0.368 0.109  6.131536e-97       4  Bcl11b
## 50  5.781851e-82 0.8736184 0.305 0.091  4.147900e-78       4 Carhsp1
FeaturePlot(experiment.aggregate, features.plot = c("Arpp21","Rbp1","Rxrg","Bcl11b"), cols.use = c("lightblue", "red"), nCol = 2, no.legend = F)

Looks like cluster 4 could be another sub-type. Do any of these genes look like anything in particular to you both?

Cluster 5

top10_FC[top10_FC$cluster == 5,]
##            p_val avg_logFC pct.1 pct.2     p_val_adj cluster    gene
## 51  0.000000e+00  3.000307 0.583 0.011  0.000000e+00       5    Xist
## 52 4.932569e-149  1.344320 0.350 0.017 3.538625e-145       5     Ogt
## 53  2.377942e-95  1.283703 0.417 0.037  1.705936e-91       5   Pnisr
## 54  7.473697e-89  2.492201 0.450 0.050  5.361630e-85       5    Meg3
## 55  6.068821e-85  4.217211 0.958 0.418  4.353772e-81       5  Malat1
## 56  3.151335e-72  1.479686 0.442 0.056  2.260767e-68       5  Zbtb20
## 57  8.053503e-61  1.337414 0.242 0.020  5.777583e-57       5 Igfbpl1
## 58  1.903641e-41  1.257782 0.408 0.078  1.365672e-37       5   Rsrp1
## 59  1.890606e-39  1.243402 0.683 0.220  1.356321e-35       5     Son
## 60  2.936256e-07  2.434219 0.208 0.078  2.106470e-03       5  Hbb-bs
FeaturePlot(experiment.aggregate, features.plot = c("Xist", "Ogt", "Meg3", "Malat1"), cols.use = c("lightblue", "red"), nCol = 2, no.legend = F)

Looks like cluster 5 is mainly X-linked genes

Cluster 3

head(cluster3[order(abs(cluster3$avg_logFC), decreasing = T), ], n=15)
##                   p_val  avg_logFC pct.1 pct.2     p_val_adj
## Ptgds     5.521240e-232  1.3546391 0.278 0.043 3.960937e-228
## Malat1    1.192680e-162  0.8756627 0.713 0.367 8.556284e-159
## Hbb-bs    4.452546e-118  0.6895767 0.221 0.053 3.194257e-114
## Ubc        1.280024e-59 -0.5787463 0.341 0.525  9.182895e-56
## Ncs1       2.989169e-32 -0.5295051 0.202 0.338  2.144430e-28
## Calm2     6.628534e-234 -0.4919251 0.975 0.991 4.755310e-230
## Dynlt1a    9.002883e-65  0.4917497 0.174 0.053  6.458668e-61
## Cldn5      3.007658e-59  0.4773154 0.113 0.027  2.157694e-55
## Hba-a1     5.904735e-55  0.4700749 0.124 0.033  4.236057e-51
## Hist1h2al  1.823679e-64  0.4312643 0.111 0.024  1.308307e-60
## Gnb2l1     6.684370e-51  0.4239240 0.547 0.349  4.795367e-47
## Kif1a      1.781093e-59  0.4192938 0.755 0.562  1.277756e-55
## Capza2     4.875303e-24 -0.4107886 0.268 0.381  3.497542e-20
## Hba-a2     7.629526e-41  0.3903012 0.066 0.014  5.473422e-37
## Fau       5.465946e-159  0.3857561 0.992 0.954 3.921270e-155
FeaturePlot(experiment.aggregate, features.plot = c("Ptgds","Ubc", "Calm2","Cldn5"), cols.use = c("lightblue", "red"), nCol = 2, no.legend = F)

  • Preliminary looks like Ptgds and Ubc are upregulated in WT whereas Calm2 and Cldn5 are downregulated
  • Will follow this up with more thorough DEX

[Tbr1+ Neuronal Cells]

Step 1: Load Data for “Tbr1+ Neuronal cells” (Cluster 0,1,2)

  • only neuronal cells (Cluster 0,1,2) which have a non-zero count for Tbr1 (using the raw data)
load(paste0(wd, "/outputs/output_02/experiment_aggregate.0.3.RData"))
neuronal_cells <- rownames(experiment.aggregate.0.3@meta.data[experiment.aggregate.0.3@meta.data$res.0.3 %in% c(0,1,2),])
experiment.aggregate <- SubsetData(experiment.aggregate.0.3, cells.use = neuronal_cells, do.clean = T)
experiment.aggregate
## An object of class seurat in project Siavash scRNA 
##  14933 genes across 12269 samples.
a <- melt(experiment.aggregate@data["Tbr1",])
a$sample_type <- "WT"
a[grep("NULL", rownames(a)),]$sample_type <- "NULL"
a[grep("HET", rownames(a)),]$sample_type <- "HET"
p1 <- ggplot(a, aes(x=value, fill=sample_type)) + 
      geom_density(alpha= 0.3) +
      scale_x_continuous(name="Normalized counts per cell") +
      ylim(0,0.5) + ylab("Density") +
      ggtitle("Density of Tbr1 expression")
tbr1.cells.keep <- rownames(a[a$value > 0, , drop=F])
p1

experiment.aggregate <- SubsetData(experiment.aggregate, cells.use = tbr1.cells.keep, do.clean = T)
experiment.aggregate
## An object of class seurat in project Siavash scRNA 
##  14933 genes across 567 samples.

Step 2: Remove cells with extreme values of genes/umi for this subset

hist(experiment.aggregate@meta.data$nGene, breaks = 100);abline(v = c(500,1700), col="red")

experiment.aggregate <- FilterCells(object = experiment.aggregate, subset.names = "nGene", 
                                    low.thresholds = 500 , high.thresholds = 1700)
experiment.aggregate
## An object of class seurat in project Siavash scRNA 
##  14933 genes across 554 samples.
hist(experiment.aggregate@meta.data$nUMI, breaks = 100);abline(v=4000, col="red")

experiment.aggregate <- FilterCells(object = experiment.aggregate, subset.names = "nUMI", 
                                    low.thresholds = -Inf , high.thresholds = 4000)
experiment.aggregate
## An object of class seurat in project Siavash scRNA 
##  14933 genes across 541 samples.

Step 3: Remove genes that are non-informative/not well represented/not variable

  • Removing mitochondrial, ribosomal, or psuedogenes
  • Removing genes that don’t occur in 1% of cells (~12)
  • Removing genes with variation below median variation across all genes
counts = as.matrix(experiment.aggregate@raw.data[, colnames(experiment.aggregate@data)])  #Extract Raw Data

means <- rowMeans(as.matrix(experiment.aggregate@data))
vars <- rowVars(as.matrix(experiment.aggregate@data))
rData = data.frame(seuratVarGenes = rownames(counts) %in% experiment.aggregate.0.3@var.genes,
                   NumberCellsOccurIn = as.numeric(rowSums(as.matrix(experiment.aggregate@data) > 0)),
                   MeanExpr = as.numeric(means), 
                   Var = as.numeric(vars),
                   CoefofVar = vars/means^2
                   )
rownames(rData) = rownames(counts)
#Remove mitochondrial, ribosomal, or psuedogenes
pt1 <- grep("^mt-|^Rps|^Rpl|^Mrps|^Mrpl|^Gm", rownames(rData), value = T)

#Remove genes that don't occur in 1% of cells (~12)
pt2 <- rownames(rData[rData$NumberCellsOccurIn < 5,])

#Remove genes with variation below minimum
pt3 <- rownames(rData[rData$Var < median(rData$Var),])
to.remove <- c(pt1, pt2, pt3)

#Update the objects
rData <- rData[!rownames(rData) %in% to.remove, ]
counts <- counts[rownames(rData),]

#Optional plotting
#hist(rData$NumberCellsOccurIn, breaks = 50)
#hist(rData$Var, breaks = 50)

Step 4: Load data into Seurat for visualization

load(paste0(wd, "/", out_dir, "/Tbr1_neuronal_object.RData"))
experiment.aggregate <- CreateSeuratObject(counts, project = "Tbr1+_neuronal_cells")
experiment.aggregate@meta.data$orig.ident = gsub(".*L5|/", "", rownames(experiment.aggregate@meta.data))
experiment.aggregate <- NormalizeData(object= experiment.aggregate, normalization.method = "LogNormalize", scale.factor = 10000)
experiment.aggregate <- ScaleData(
  object = experiment.aggregate, #will populate new slot "scale data", does not change "data" slot
  do.scale = TRUE,
  do.center = TRUE, 
  vars.to.regress = c("nUMI", "nGene")) 

experiment.aggregate <- RunPCA(
  object = experiment.aggregate,
  pc.genes = rownames(experiment.aggregate@data), #use all genes
  pcs.compute = 12, do.print = F,
  maxit = 500)

experiment.aggregate <- JackStraw(
    object = experiment.aggregate, 
    num.replicate = 100, 
    num.pc = 12)

use.pcs = c(1:7)  #Chose all PCs accounting for more than 5% of variance
experiment.aggregate <- FindClusters(
    object = experiment.aggregate, 
    reduction.type = "pca", 
    dims.use = use.pcs, 
    resolution = seq(0.3,1.2,0.3), 
    print.output = FALSE, 
    force.recalc = TRUE,
    save.SNN = TRUE
)

experiment.aggregate <- SetAllIdent(experiment.aggregate, id = "res.0.3")
experiment.aggregate <- RunTSNE( object = experiment.aggregate, reduction.use = "pca", dims.use = use.pcs, do.fast = TRUE)
save(experiment.aggregate, file=paste0(out_dir, "/Tbr1_neuronal_object.RData"))

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin17.7.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS/LAPACK: /usr/local/Cellar/openblas/0.3.6_1/lib/libopenblasp-r0.3.6.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] gridExtra_2.3               reshape2_1.4.3             
##  [3] scales_1.0.0                cluster_2.1.0              
##  [5] dynamicTreeCut_1.63-1       SingleCellExperiment_1.6.0 
##  [7] SummarizedExperiment_1.14.1 DelayedArray_0.10.0        
##  [9] BiocParallel_1.18.1         matrixStats_0.55.0         
## [11] Biobase_2.44.0              GenomicRanges_1.36.1       
## [13] GenomeInfoDb_1.20.0         IRanges_2.18.3             
## [15] S4Vectors_0.22.1            BiocGenerics_0.30.0        
## [17] dplyr_0.8.3                 Seurat_2.3.4               
## [19] Matrix_1.2-17               cowplot_1.0.0              
## [21] ggplot2_3.2.1               knitr_1.25                 
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15             colorspace_1.4-1       class_7.3-15          
##   [4] modeltools_0.2-22      ggridges_0.5.1         mclust_5.4.5          
##   [7] htmlTable_1.13.2       XVector_0.24.0         base64enc_0.1-3       
##  [10] rstudioapi_0.10        proxy_0.4-23           npsurv_0.4-0          
##  [13] flexmix_2.3-15         bit64_0.9-7            codetools_0.2-16      
##  [16] splines_3.6.0          R.methodsS3_1.7.1      lsei_1.2-0            
##  [19] robustbase_0.93-5      zeallot_0.1.0          Formula_1.2-3         
##  [22] jsonlite_1.6           ica_1.0-2              kernlab_0.9-27        
##  [25] png_0.1-7              R.oo_1.22.0            compiler_3.6.0        
##  [28] httr_1.4.1             backports_1.1.5        assertthat_0.2.1      
##  [31] lazyeval_0.2.2         lars_1.2               acepack_1.4.1         
##  [34] htmltools_0.4.0        tools_3.6.0            igraph_1.2.4.1        
##  [37] GenomeInfoDbData_1.2.1 gtable_0.3.0           glue_1.3.1            
##  [40] RANN_2.6.1             Rcpp_1.0.2             vctrs_0.2.0           
##  [43] gdata_2.18.0           ape_5.3                nlme_3.1-141          
##  [46] iterators_1.0.12       fpc_2.2-3              gbRd_0.4-11           
##  [49] lmtest_0.9-37          xfun_0.10              stringr_1.4.0         
##  [52] lifecycle_0.1.0        irlba_2.3.3            gtools_3.8.1          
##  [55] DEoptimR_1.0-8         zlibbioc_1.30.0        MASS_7.3-51.4         
##  [58] zoo_1.8-6              doSNOW_1.0.18          RColorBrewer_1.1-2    
##  [61] yaml_2.2.0             reticulate_1.13        pbapply_1.4-2         
##  [64] rpart_4.1-15           segmented_1.0-0        latticeExtra_0.6-28   
##  [67] stringi_1.4.3          foreach_1.4.7          checkmate_1.9.4       
##  [70] caTools_1.17.1.2       bibtex_0.4.2           Rdpack_0.11-0         
##  [73] SDMTools_1.1-221.1     rlang_0.4.0            pkgconfig_2.0.3       
##  [76] dtw_1.21-3             prabclus_2.3-1         bitops_1.0-6          
##  [79] evaluate_0.14          lattice_0.20-38        ROCR_1.0-7            
##  [82] purrr_0.3.2            labeling_0.3           htmlwidgets_1.5.1     
##  [85] bit_1.1-14             tidyselect_0.2.5       plyr_1.8.4            
##  [88] magrittr_1.5           R6_2.4.0               snow_0.4-3            
##  [91] gplots_3.0.1.1         Hmisc_4.2-0            pillar_1.4.2          
##  [94] foreign_0.8-72         withr_2.1.2            fitdistrplus_1.0-14   
##  [97] mixtools_1.1.0         RCurl_1.95-4.12        survival_2.44-1.1     
## [100] nnet_7.3-12            tsne_0.1-3             tibble_2.1.3          
## [103] crayon_1.3.4           hdf5r_1.3.0            KernSmooth_2.23-15    
## [106] rmarkdown_1.16         grid_3.6.0             data.table_1.12.4     
## [109] metap_1.1              digest_0.6.21          diptest_0.75-7        
## [112] tidyr_1.0.0            R.utils_2.9.0          munsell_0.5.0